import numpy as np
from spacy.lang.en import English
import spacy
from io import StringIO


def convert(fname, threshold=.9):
    """Strip signature-block lines from an email file and save the result.

    The email at ``fname`` is read, split into lines, and every line that is
    not classified as part of a signature block is written to a sibling file
    whose name is ``fname`` with ``_clean`` inserted before the extension
    (``mail.txt`` -> ``mail_clean.txt``).

    Parameters
    ----------
    fname : str
        Path of the email text file to clean.
    threshold : float
        Cut-off forwarded to ``_generate_text``; lines whose score is at or
        above it are dropped.
    """
    # Local import: the file header does not import ``os``.
    import os

    original_email = _read_email(fname)
    sentences = _corpus2sentences(original_email)  # convert to sentences

    # splitext only splits on the final extension, so dots in directory
    # names ("./mail.txt"), multiple dots ("a.b.txt") and extension-less
    # names are all handled; the previous split(".") mangled or crashed on
    # those.
    root, ext = os.path.splitext(fname)
    new_fname = root + "_clean" + ext

    # Forward the caller's threshold; it used to be silently ignored.
    _generate_text(sentences, new_fname, threshold)


def _read_email(fname):
    """Return the full contents of the text file at ``fname`` as one string.
    """
    with open(fname, 'r') as handle:
        return handle.read()


def _corpus2sentences(corpus):
    """Break a corpus into its lines.

    Leading and trailing whitespace of the whole corpus is removed first;
    the remainder is cut at every newline character, so interior blank
    lines survive as empty strings.
    """
    trimmed = corpus.strip()
    lines = trimmed.split('\n')
    return lines


def _generate_text(sentences, fname, threshold=0.9):
    """Write every line that is not part of a signature block to a file.

    Each line is scored with ``_prob_block``. A line is kept only when its
    score is strictly below ``threshold``; otherwise it is treated as part
    of the signature block and skipped. Empty lines score 1.0, so with the
    default threshold they are skipped as well.

    Parameters
    ----------
    sentences : iterable of str
        Lines of the email, without trailing newlines.
    fname : str
        Path of the output file; it is created or overwritten.
    threshold : float
        Lower thresholds will result in more false positives.
    """
    tagger = spacy.load('en_core_web_sm')
    with open(fname, "w") as new_file:
        for sentence in sentences:
            if _prob_block(sentence, tagger) < threshold:
                # The lines were produced by splitting on '\n', so the
                # terminator has to be restored; without it every kept
                # line ran together on a single line of the output file.
                new_file.write(sentence + "\n")


def _prob_block(sentence, pos_tagger):
    """Score how likely a line is to belong to a signature block.

    The score is the fraction of tokens in the line whose coarse
    part-of-speech tag is not ``"VERB"``. A line that yields no tokens
    scores 1.0.

    https://spacy.io/usage/linguistic-features

    Parameters
    ----------
    sentence : str
        Line of the email.
    pos_tagger : callable
        Called with ``sentence``; must return a sized iterable of tokens
        that expose a ``pos_`` attribute.

    Returns
    -------
    probability(signature block | line)
    """
    doc = pos_tagger(sentence)
    token_total = len(doc)
    if not token_total:
        # Nothing to tag: treat the line as signature material.
        return 1.0

    non_verbs = sum(1 for token in doc if token.pos_ != "VERB")
    return float(non_verbs) / token_total


def generate_text(sentences, threshold=0.9):
    """Return the email text with signature-block lines removed.

    Each line is scored with ``_prob_block``. A line is kept only when its
    score is strictly below ``threshold``; otherwise it is treated as part
    of the signature block and dropped. Empty lines score 1.0, so with the
    default threshold they are dropped as well.

    A diagnostic trace is printed to stdout: first every input line, then
    each line together with its score, and each kept line once more.

    Parameters
    ----------
    sentences : iterable of str
        Lines of the email, without trailing newlines.
    threshold : float
        Lower thresholds will result in more false positives.

    Returns
    -------
    str
        The kept lines joined with newline characters.
    """
    tagger = spacy.load('en_core_web_sm')

    # The input is walked twice (dump, then scoring); materialise it so a
    # one-shot iterator is not exhausted by the dump.
    sentences = list(sentences)

    for each in sentences:
        print(each)
        print('-' * 10)
    print('-' * 80)

    # Kept lines are collected in a list. The earlier StringIO buffer was
    # read back without rewinding, so the function always returned ''.
    kept = []
    for sentence in sentences:
        score = _prob_block(sentence, tagger)  # tag once, reuse below
        print(score, sentence)
        if score < threshold:
            kept.append(sentence)
            print(sentence)
    return '\n'.join(kept)


# Sample email used by the module-level demo that follows: a notice body,
# then a sign-off, sender contact details and a confidentiality disclaimer.
# The literal is runtime data; its blank lines and spacing are significant.
raw_content = '''Dear Customer,



We inform you of the change in the mock market operation hours in the member firm test plan, reflecting the change in the trading hours and order receiving time for opening single price session and pre-market hours.



A. Change in the mock market operation hours in the member firm test plan



  o Change : Mock market operation hours to be identical to the trading hours effective from April 15, 2019



  o Effective date : March 25, 2019 (Monday), April 8, 2019(Monday)

   - Test batch data transmission time : 07:20 KST (only once)



  * Except the date stated above, mock market operation hours will be applied as already described in the member firm test plan document.



Thank you.





Best Regards,

 
Ashley Park

Market Data System Department
Koscom Corporation
76, Yeouinaru-ro, Youngdeungpo-gu, 
Seoul, Korea 07329

Office: +82-2-6331-6331
Email: dataspec@koscom.co.kr
---------------------------------------------------------
This message and all attachments are confidential. Any unauthorized review, use, disclosure, or distribution is prohibited. If you believe this message has been sent to you by mistake, please notify the sender by replying to this transmission, and delete the message and its attachments without disclosing them.
'''
# Demo run: split the sample email into lines, filter it, then print a
# separator rule followed by the filtered text.
# NOTE(review): this executes whenever the module is imported (it loads the
# spaCy model and prints); consider an `if __name__ == "__main__":` guard.
ss = generate_text(_corpus2sentences(raw_content))
print('-' * 80, ss, sep='\n')
